Set working directory

# NOTE(review): this path is machine-specific; setwd() raises an error on any
# machine where the directory does not exist.
setwd("~/Desktop/working-with-lyle/formality_project")

# Install pacman only when it is missing. requireNamespace() checks whether the
# package is available without attaching it; library() below attaches it.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)

# Use pacman to load every package quickly, installing any that are missing.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
pacman::p_load(
  tidyverse, rlang, zoo, lubridate, plotrix, ggpubr, caret, broom,
  kableExtra, reactable,
  install = TRUE
)

Set plot aesthetics

# Hex colour palettes.
palette_map <- c("#3B9AB2", "#EBCC2A", "#F21A00")
palette_condition <- c("#ee9b00", "#bb3e03", "#005f73")

# Shared ggplot theme added to every figure: classic base theme, legend on
# top, black axis text and lines, and no tick marks on the y axis.
plot_aes <- theme_classic() +
  theme(
    legend.position = "top",
    legend.text = element_text(size = 12),
    text = element_text(size = 16, family = "Futura Medium"),
    axis.text = element_text(color = "black"),
    axis.line = element_line(colour = "black"),
    axis.ticks.y = element_blank()
  )

Set table function

 # Render a fitted model's coefficient table as a styled kable.
 #
 # model_data: fitted model object; it is passed to tidy().
 # reference:  label that replaces "(Intercept)" in the term column.
 #
 # Returns the styled table produced by kableExtra::kable_styling().
 table_model <- function(model_data, reference = "Intercept") {
   coef_tbl <- tidy(model_data)

   # Shorten the statistic column names for display.
   coef_tbl <- rename(
     coef_tbl,
     "SE" = std.error,
     "t" = statistic,
     "p" = p.value
   )

   # Relabel the terms; the two substitutions run in this order.
   coef_tbl <- mutate(
     coef_tbl,
     term = gsub("\\(Intercept\\)", !!reference, term),
     term = gsub("Date", "Original Publication Date", term)
   )

   kableExtra::kable_styling(kable(coef_tbl))
 }

Load in and clean data

# Read in the data.
df <- read_csv("https://raw.githubusercontent.com/scm1210/Language_Lab_Repro/main/Atlantic_Cleaned_all_vars.csv")

# Screen outliers ----
# Each measure listed here gets a companion `<name>_scaled` column holding the
# result of scale() (centered and divided by the column's standard deviation).
vars_to_scale <- c(
  "Analytic", "WPS", "BigWords", "Period", "readability", "grade_level",
  "i", "we", "pronoun", "article", "cogproc", "Apostro", "Conversation",
  "det", "syllables_per_word", "syllables_per_sentence"
)
df[, paste0(vars_to_scale, "_scaled")] <- lapply(df[, vars_to_scale], scale)

# Keep only rows within 3 standard deviations on every screened measure.
# The Apostro, Conversation and det terms previously had no `<= 3`, so they
# were coerced to logical (TRUE for any non-zero value) and screened nothing.
# Fixing this changes which rows are retained; re-check downstream counts and
# model estimates.
# NOTE(review): syllables_per_word_scaled and syllables_per_sentence_scaled are
# computed above but not screened here -- confirm that is intentional.
df <- subset(
  df,
  abs(Analytic_scaled) <= 3 &
    abs(WPS_scaled) <= 3 &
    abs(BigWords_scaled) <= 3 &
    abs(Period_scaled) <= 3 &
    abs(readability_scaled) <= 3 &
    abs(grade_level_scaled) <= 3 &
    abs(i_scaled) <= 3 &
    abs(we_scaled) <= 3 &
    abs(pronoun_scaled) <= 3 &
    abs(article_scaled) <= 3 &
    abs(cogproc_scaled) <= 3 &
    abs(Apostro_scaled) <= 3 &
    abs(Conversation_scaled) <= 3 &
    abs(det_scaled) <= 3
)



# Filter out impossible values: a row is kept only when every bound below
# holds (comma-separated conditions in filter() are combined with AND).
df <- df %>%
  filter(
    readability <= 120,
    readability >= 0,
    grade_level >= 0,
    grade_level <= 18,
    Period > 0,
    Period <= 20,
    WPS < 145
  )

Tidy the data and center variables

Flesch-Kincaid Description

Flesch-Kincaid Ease of Readability: higher scores indicate material that is easier to read; lower numbers mark passages that are more difficult to read.

The Flesch–Kincaid Grade Level Score: presents a score as a U.S. grade level, making it easier for teachers, parents, librarians, and others to judge the readability level of various books and texts.

Corpus Summary Stats

The following corpus consists of 42,528 articles ranging from 1857 to 2022.

Dates

# Smallest and largest value of the Date column.
range(select(df, Date))
## [1] 1857 2022

Raw count of Articles

Number arrived at after filtering out outliers and duplicates

# Total number of rows in df, shown as a one-cell interactive table.
reactable::reactable(
  dplyr::summarize(select(df, Filename), n = n()),
  striped = TRUE
)

Number of Articles per Year

# Count the distinct (Filename, Date) pairs for each Date value and keep the
# result as an interactive table.
articles_year <- df %>%
  dplyr::distinct(Filename, Date) %>%
  dplyr::count(Date, name = "n") %>%
  reactable::reactable(striped = TRUE)

# Display the table.
articles_year

Flesch-Kincaid Graphs

Please see attached files for the graphs if needed.

Plotting the smoothed data by year

# Build a scatter plot of one tidy_df column against Date with a loess trend.
#
# y_col:   name (string) of the column mapped to the y axis.
# title:   plot title.
# y_label: y-axis label.
# span:    loess span passed to geom_smooth().
# data:    data frame containing Date and y_col; defaults to tidy_df.
#
# Returns a ggplot object. The single theme() call holds the values that were
# previously reached by stacking several theme() calls that overrode each other.
plot_yearly_smooth <- function(y_col, title, y_label, span = 0.80,
                               data = tidy_df) {
  ggplot(data = data, aes(x = Date, y = .data[[y_col]], group = 1)) +
    ggtitle(title) +
    geom_point(color = "dodgerblue3", alpha = 0.7) +
    geom_smooth(method = "loess", span = span) +
    plot_aes +
    labs(x = "Year", y = y_label) +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      axis.text = element_text(size = 14),
      axis.title = element_text(size = 20, face = "bold"),
      plot.title.position = "plot",
      plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
    )
}

# Readability uses a tighter span (0.60) than the other measures (0.80).
readability_smooth_tidy <- plot_yearly_smooth(
  "readability_mean", "Readability", "Ease of Readability",
  span = 0.60
)

grade_smooth_tidy <- plot_yearly_smooth(
  "grade_level_mean", "Grade Level", "Grade Level Score"
)

# This plot was previously defined twice with identical code; once is enough.
syllables_per_word_smooth_tidy <- plot_yearly_smooth(
  "syllables_per_word_mean", "Syllables per word", "Syllables per word"
)

syllables_per_sentence_smooth_tidy <- plot_yearly_smooth(
  "syllables_per_sentence_mean", "Syllables per sentence",
  "Syllables per sentence"
)

# Stack three of the plots in one column under a shared title.
# NOTE(review): syllables_per_sentence_smooth_tidy is not part of the combined
# figure -- confirm that is intentional.
tidy_smooth_graphs <- ggpubr::ggarrange(
  readability_smooth_tidy, grade_smooth_tidy,
  syllables_per_word_smooth_tidy,
  ncol = 1, nrow = 3, common.legend = TRUE, legend = "bottom"
)
annotate_figure(
  tidy_smooth_graphs,
  top = text_grob(
    "Atlantic Flesch-Kincaid and Syllables",
    color = "black", face = "bold", size = 20
  )
)

Individual Graphs

# Show each plot on its own; a bare object name at top level auto-prints it.
readability_smooth_tidy 

grade_smooth_tidy

syllables_per_sentence_smooth_tidy

syllables_per_word_smooth_tidy

Build Simple Regression Models

Ease of Readability

The model presented is centered on the means for the first year in the dataset.

# Centered model: linear regression of readability_centered on Date.
Readability_centered <- lm(
  readability_centered ~ Date,
  data = tidy_df
)

# Show the coefficient table.
table_model(model_data = Readability_centered)
term estimate SE t p
Intercept 46.4160 4.9999 9.283 0
Original Publication Date -0.0261 0.0026 -10.144 0

Grade Level Reading

The model presented is centered on the means for the first year in the dataset.

# Centered model: linear regression of grade_level_centered on Date.
Grade_centered <- lm(
  grade_level_centered ~ Date,
  data = tidy_df
)

# Show the coefficient table.
table_model(model_data = Grade_centered)
term estimate SE t p
Intercept 6.7569 1.1769 5.741 0
Original Publication Date -0.0033 0.0006 -5.383 0

Syllables per Word

# Linear regression of the syllables_per_word_centered column on Date. The
# fitted model is stored under the same name as the outcome column; the
# formula still resolves the column from tidy_df.
syllables_per_word_centered <- lm(
  syllables_per_word_centered ~ Date,
  data = tidy_df
)

# Show the coefficient table.
table_model(model_data = syllables_per_word_centered)
term estimate SE t p
Intercept -1.1875 0.0594 -19.99 0
Original Publication Date 0.0006 0.0000 21.00 0